rm(list=ls())
library(foreign)
library(readstata13)
library(data.table) ## v >= 1.9.6
library(magrittr)
library(survey)
library(tidyverse)

Sys.setlocale("LC_CTYPE", "es_ES.ISO8859-1")
## set your working directory to the replication folder here: 
setwd("~/Dropbox/facebook sampling/replication/mexico replication")
exports<-"~/Dropbox/facebook sampling/replication/data_exports/"

## load census data ####
munikey <- read.csv("mexico data/mexico_muni_key.csv",stringsAsFactors = FALSE)
unique(munikey[,1])
names(munikey)[1]<-"State"
munikey<-munikey%>%dplyr::mutate(State=str_trim(State,"both"))

ipums<-read.dta13('mexico data/ipumsi_00002.dta') 
states<-read.csv("mexico data/ipums_states.csv",encoding="UTF-8",header=FALSE)
states<-data.frame(as.character(states[,1]))
names(states)<-"name"
states<-states%>%
  dplyr::mutate(num=factor(as.numeric(substr(name,1,3))),
                name=as.character(name))%>%
  dplyr::mutate(name=substr(name,6,nchar(name)))

## clean up state and muni names 
ipums<-ipums%>%dplyr::mutate(munistate=paste(geo1_mx2015,geo2_mx2015,sep="."))
munikey<-munikey%>%dplyr::mutate(munistate=tolower(paste(State,Muni,sep=".")))
length(setdiff(munikey$munistate,ipums$munistate)) ## these have accents and accents have been removed in other dataset
## same with munis. -->take out accents from ipums munistate variable
ipums<-ipums%>%
  dplyr::mutate(munistate=gsub("?","e",munistate))




## census definition of each education category: 
#La educación básica:  preescolar o kínder, primaria incompleta, 
# primaria completa, estudios técnicos o comerciales con primaria terminada, 
# secundaria incompleta y secundaria completa; 
# la media superior, incluye estudios técnicos o comerciales con secundaria terminada, 
# preparatoria o bachillerato (general o tecnológico) y normal básica; 
# y la superior, incluye estudios técnicos o comerciales con preparatoria 
# terminada, profesional (licenciatura, normal superior o equivalente), 
# especialidad, maestría o doctorado.

# clean and create weights for our facebook sample
mexico <- read.csv("mexico data/mexico_facebook.csv")
mexico<-mexico[3:nrow(mexico),]

## drop levels from demographic variables 
mexico$religion<-as.character(mexico$religion)
mexico$marital.status<-as.character(mexico$marital.status)
mexico$education<-as.character(mexico$education)

#### start by cleaning up age variable 
mexico$age[grep("cumplid", mexico$age)] <- substr(mexico$age[grep("cumplid", mexico$age)],1,2)

mexico$age[which(mexico$age=="9 de agosto")] <-NA
mexico$age[which(mexico$age=="Sesenta y siete." )] <- "67"
mexico$age[which(mexico$age==",34")] <- "34"
mexico$age[which(mexico$age=="Cu")] <- NA
mexico$age[which(mexico$age=="66.")] <-"66"
mexico$age[which(mexico$age=="sesenta y siete")] <-"67"
mexico$age[which(mexico$age=="setenta y tres")] <-"73"
mexico$age[which(mexico$age=="Cincuenta"  )] <-"50"
mexico$age[which(mexico$age=="Sesenta y un")] <-"61"
mexico$age[which(mexico$age=="Cincuenta y dos")] <-"52"
mexico$age[which(mexico$age=="5u")] <- "50"
mexico$age[which(mexico$age=="Sesenta")] <- "60"
mexico$age[which(mexico$age=="66 aÅos")] <- "66"
mexico$age[which(mexico$age=="12 y 5 meses")] <- "12"
mexico$age[which(mexico$age=="Los que tenga, por que la pregunta, disculpe usted?")] <-NA
mexico$age[which(mexico$age=="QuÃ© le importa")] <-NA
mexico$age[which(mexico$age=="Setenta")] <- "70"
mexico$age[which(mexico$age=="1")] <-NA
mexico$age[which(mexico$age=="Jajajaja")] <-NA
mexico$age[which(mexico$age=="Cuarenta y nueve")] <- "20"
mexico$age[which(mexico$age=="Veintisiempre")] <-"20"
mexico$age[which(mexico$age=="81.")] <-"81"
mexico$age[which(mexico$age=="82 pero estoy consiente del planeta Tierra y me preocupan las generaciones jÃ³venes")] <-"82"
mexico$age[which(mexico$age=="5 mayo")] <-NA
mexico$age[which(mexico$age=="82 pero estoy consiente del planeta Tierra y me preocupan las generaciones jÃ³venes")] <-"82"
mexico$age[which(mexico$age=="Prefiero no contestar")] <-NA
mexico$age[which(mexico$age=="Sesenta y siete")] <- "67"
mexico$age[which(mexico$age=="asdf")] <-NA
mexico$age[which(mexico$age=="Cuarenta y cinco")] <- "45"
mexico$age[which(mexico$age=="Muchos")] <-NA
mexico$age[which(mexico$age=="En sept 15 cumplo 72")] <- "71"
mexico$age[which(mexico$age=="Xx")] <-NA
mexico$age[which(mexico$age=="Aprox 50")] <-"50"
mexico$age[which(mexico$age=="10000")] <-NA
mexico$age[which(mexico$age=="X")] <-NA
mexico$age[which(mexico$age=="197384")] <-"46"
mexico$age[which(mexico$age=="24/04/97")] <-"22"
mexico$age[which(mexico$age=="Q te importa")] <-NA
mexico$age[which(mexico$age=="10,800 dÃ­as")] <- "29"
mexico$age[which(mexico$age=="Treinta")] <-"30"
mexico$age[which(mexico$age=="22 Ä")] <-"22"
mexico$age[which(mexico$age=="65m")] <-"65"
mexico$age[which(mexico$age=="60 y ocho meses")] <-"60"
mexico$age[which(mexico$age=="Sesenta y cinco")] <-"65"
mexico$age[which(mexico$age=="Sesenta y dos")] <-"62"
mexico$age[which(mexico$age=="Setenta")] <-"70"
mexico$age[which(mexico$age=="Algunos")] <-NA
mexico$age[which(mexico$age=="53 AÃOS")] <-"53"
mexico$age[which(mexico$age=="Sesenta y ocho")] <-"68"
mexico$age[which(mexico$age=="2X")] <-NA
mexico$age[which(mexico$age=="kince")] <-"50"
mexico$age[which(mexico$age=="64 AÃOS")] <-"64"
mexico$age[which(mexico$age=="Medio siglo.")] <- "50"
mexico$age[which(mexico$age=="dicesiete")] <- "17"
mexico$age[which(mexico$age=="Sesenta y seis")] <- "76"
mexico$age[which(mexico$age=="Uytt")] <-NA
mexico$age[which(mexico$age=="77 ")] <-"77"
mexico$age[which(mexico$age=="56. Anos")] <-"56"
mexico$age[which(mexico$age=="Sesenta y cinco.")] <- "65"
mexico$age[which(mexico$age=="Sesenta y tres")] <- "63"
mexico$age[which(mexico$age=="04 / 03/ 2004")] <-"15"
mexico$age[which(mexico$age=="40+1")] <-"41"
mexico$age[which(mexico$age=="Setenta y cuatro")] <- "74"
mexico$age[which(mexico$age=="Adulto mayor")] <-NA
mexico$age[which(mexico$age=="65 para cumplir")] <-"65"
mexico$age[which(mexico$age=="Gti")] <-NA
mexico$age[which(mexico$age=="No quiero")] <-NA
mexico$age[which(mexico$age=="Ã")] <-NA
mexico$age[which(mexico$age=="159")] <-NA
mexico$age[which(mexico$age=="Setenta y dos")] <- "72"
mexico$age[which(mexico$age=="Lxiii")] <- "63"
mexico$age[which(mexico$age=="H,58")] <- "58"
mexico$age[which(mexico$age=="No sÃ© confidencialidad")] <-NA
mexico$age[which(mexico$age=="0")] <-NA
mexico$age[which(mexico$age=="63,")] <-NA
mexico$age[which(mexico$age=="No importa mas que a mi")] <-NA
mexico$age[which(mexico$age=="Prefiero no decirlo")] <-NA
mexico$age[which(mexico$age=="60.")] <-NA
mexico$age[which(mexico$age=="Cincuenta y ocho")] <-"58"
mexico$age[which(mexico$age=="\U0001f920")] <-NA
mexico$age[which(mexico$age=="65â¸")] <-NA
mexico$age[which(mexico$age=="doce")] <-12
mexico$age[which(mexico$age=="60!")] <-"60"
mexico$age[which(mexico$age==",50")] <- "50"
mexico$age[which(mexico$age=="43 7 meses")] <- "43"
mexico$age[which(mexico$age=="12/10/04")] <-"15"
mexico$age[which(mexico$age=="67 ")] <- "67"
mexico$age[which(mexico$age=="57j")] <- "57"
mexico$age[which(mexico$age=="no lo dire")] <-NA
mexico$age[which(mexico$age=="75 ")] <-NA
mexico$age[which(mexico$age=="14 septiembre")] <-NA
mexico$age[which(mexico$age==".")] <-NA
mexico$age[which(mexico$age=="Setenta y tres")] <- "73"
mexico$age[which(mexico$age=="7o")] <- "70"
mexico$age[which(mexico$age=="Cincuenta y cinco")] <- "55"
mexico$age[which(mexico$age=="54 alo")] <- "54"
mexico$age[which(mexico$age=="Hola 83")] <- "83"
mexico$age[which(mexico$age=="74.")] <- "74"
mexico$age[which(mexico$age=="Mas de 60")] <- "60"
mexico$age[which(mexico$age=="57.")] <- "57"
mexico$age[which(mexico$age=="1234")] <-NA
mexico$age[which(mexico$age=="Q te importa")] <-NA
mexico$age[which(mexico$age=="No")] <-NA
mexico$age[which(mexico$age=="P")] <-NA
mexico$age[which(mexico$age=="Ochenta")] <- "80"
mexico$age[which(mexico$age=="621")] <- "62"
mexico$age[which(mexico$age=="70 setenta")] <- "70"
mexico$age[which(mexico$age=="Setenta.")] <- "70"
mexico$age[which(mexico$age=="76.")] <- "76"
mexico$age[which(mexico$age=="54 alos")] <- "54"
mexico$age[which(mexico$age=="5")] <- "50"
mexico$age[which(mexico$age=="109")] <- NA
mexico$age[which(mexico$age=="112")] <- NA
mexico$age[which(mexico$age=="Veinticuatro")]<-"24"
mexico$age[which(mexico$age=="Sesenta y cuatro")]<-"64"
mexico$age[which(mexico$age=="Cincuenta y seis")]<-"56"
mexico$age[which(mexico$age=="Y esto de la edad como ,que")]<-NA
mexico$age[which(mexico$age=="15 :v")]<-NA
mexico$age[which(mexico$age=="No te digo")]<-NA
mexico$age[which(mexico$age=="Qti")]<-NA
mexico$age[which(mexico$age=="Que te importa")]<-NA
mexico$age[which(mexico$age=="Ochenta ycinco")]<-"85"
mexico$age[which(mexico$age=="Setenta y cinco")]<-"75"
mexico$age[grep("56",mexico$age)]<-"56"
mexico$age[which(mexico$age=="5o")]<-"50"
mexico$age[which(mexico$age=="29 de agosto del 2000")]<-"19"
mexico$age[which(mexico$age=="NosÃ±pse")]<-NA
mexico$age[which(mexico$age=="5(")]<-NA
mexico$age[which(mexico$age=="Mas de 50")]<-"50"
mexico$age[which(mexico$age==",44")]<-"44"
mexico$age[which(mexico$age=="Seventa yyres")]<-"73"
mexico$age[which(mexico$age=="69m")]<-"69"
mexico$age[which(mexico$age=="57sÃ±os")]<-"57"
mexico$age[which(mexico$age=="54 Anos")]<-"54"
mexico$age[which(mexico$age=="37,8meses")]<-"37"
mexico$age[which(mexico$age=="Kemado x el sol hillo son....48")]<-"48"
mexico$age[which(mexico$age=="5p")]<-"50"
mexico$age[which(mexico$age==",54")]<-"54"
mexico$age[which(mexico$age==",75")]<-"75"
mexico$age[which(mexico$age=="4-7")]<-"47"
mexico$age[which(mexico$age=="74...")]<-"74"
mexico$age[which(mexico$age=="Sesenta y ocho 68")]<-"68"
mexico$age[which(mexico$age=="Miedad es 83")]<-"83"
mexico$age[which(mexico$age=="10p")]<-NA
mexico$age[which(mexico$age=="3 5")]<-"35"
mexico$age[which(mexico$age=="Cuarenta y cuatro")]<-"44"
mexico$age[which(mexico$age=="46amos")]<-"46"
mexico$age[which(mexico$age=="Sesenyta y dos")]<-"62"
mexico$age[mexico$age=="Sesenta y siete"]<-"67"
mexico$age[grepl("Sesenta y cinco",mexico$age)]<-"65"
mexico$age[grepl("Sesenta y dos",mexico$age)]<-"62"
mexico$age[grepl("Sesenta",mexico$age)]<-"60"
mexico$age[grepl("kince",mexico$age)]<-"15"
mexico$age[grepl("Sesenta y un",mexico$age)]<-"61"
mexico$age[grepl("Cincuenta y dos",mexico$age)]<-"52"
mexico$age[grepl("Sesenta y seis ",mexico$age)]<-"66"
mexico$age[grepl("Sesenta y ocho",mexico$age)]<-"68"
mexico$age[grepl("Setenta y cuatro",mexico$age)]<-"74"
mexico$age[grepl("Cincuenta y cinco",mexico$age)]<-"55"
mexico$age[grepl("doce",mexico$age)]<-"12"
mexico$age[grepl("Ochenta",mexico$age)]<-"80"


mexico$age[grep("a?os", mexico$age)] 
mexico$age2<-parse_number(mexico$age)

mexico$ageselfreport <- mexico$age2

## Facebook appears to have pushed to some underage respondents, we drop these immediately
mexico <- mexico[-which(mexico$ageselfreport<18),]
table(mexico$ageselfreport)
mexico$ageselfreport[which(mexico$ageselfreport>100)]<-NA
mexico$age<-NULL
mexico$age2<-NULL


# missing a group for someone, drop them
mexico <- mexico[-which(mexico$group==""),]

# fixing some problems with the "group" variable (each of these was an error in the URL)

# first the URL group for Centro 1a F 60+ was wrong, and incorrectly also labeled these as M
mexico$group <- as.character(mexico$group)
table(mexico$group)
mexico$group[which(mexico$group=="centro1a.M.60" & mexico$sex=="Mujer")]  <- "centro1a.F.60" # 20 is the right number for this cell

#same
mexico$group[which(mexico$group=="centro2a.M.60" & mexico$sex=="Mujer")]  <- "centro2a.F.60" # 31 is the right number for this cell

#same
mexico$group[which(mexico$group=="occidente1b.M.60" & mexico$sex=="Mujer")]  <- "occidente1b.F.60"   # one problem gender is getting reclassified here, FYI, should only be 12 in this cell, but there are 13, so one in the Male cell reported themselves a Female or was Female

#same
mexico$group[which(mexico$group=="norte4a.M.60" & mexico$sex=="Mujer")] <- "norte4a.F.60"  # one problem gender is getting reclassified here, FYI, should only be 12 in this cell, but there are 13, so one in the Male cell reported themselves a Female or was Female

# a few weirder data entry mix-ups to fix
mexico$group[which(mexico$group=="sur1d.M.50to59" & mexico$sex=="Mujer")] <- "sur1d.F.60" # one problem gender is getting reclassified here, FYI, should only be 12 in this cell, but there are 13, so one in the Male cell reported themselves a Female or was Female

mexico$group[which(mexico$group=="sur1d.F.60"&mexico$sex=="Hombre")] <- "sur1d.M.60" # there should be 9, but are only 7.


# wrong age on this group
mexico$group[which(mexico$group=="norte2a.M.18to29" & mexico$age>29)] <- "norte2a.M.30to49" # a bit of error introduced here, as this is a few more than is the case, but no way to sort this out retrospectively

# wrong age on one subset here
mexico$group[which(mexico$group=="norte2a.M.18to29" & mexico$age>29)] <- "norte2a.M.30to49" # a bit of error introduced here, as this is a few more than is the case, but no way to sort this out retrospectively


mexico$age[which(mexico$cell=="sur1.M.50to59")] <- "norte4a.F.60"




# collapse low population cells
mexico$cell <- mexico$group
mexico$cell <- gsub("sur1a", "sur1",mexico$cell)
mexico$cell <- gsub("sur1b", "sur1",mexico$cell)
mexico$cell <- gsub("sur1c", "sur1",mexico$cell)
mexico$cell <- gsub("sur1d", "sur1",mexico$cell)
mexico$cell <- gsub("sur1e", "sur1",mexico$cell)

mexico$cell <- gsub("centro1a", "centro1",mexico$cell)
mexico$cell <- gsub("centro1b", "centro1",mexico$cell)

mexico$cell <- gsub("norte1a", "norte1",mexico$cell)
mexico$cell <- gsub("norte1b", "norte1",mexico$cell)

mexico$cell <- gsub("occidente1a", "occidente1",mexico$cell)
mexico$cell <- gsub("occidente1b", "occidente1",mexico$cell)

## recode demographics 
mexico%<>%
  mutate(genderFB=case_when(grepl(".M.",cell)~"M",
                            grepl(".F.",cell)~"F"),
         ageFB=case_when(grepl("18to29",cell)~"18to29",
                        grepl("30to49",cell)~"30to49",
                        grepl("50to59",cell)~"50to59",
                        grepl("60",cell)~ "60"),
         Stateselfreport=as.character(Q40_1),
         Muniselfreport=as.character(Q40_2))

munikey$Muni <- as.character(munikey$Muni)
munikey$State <- as.character(munikey$State)

## make sure all munis and states match
setdiff(mexico$Stateselfreport,munikey$State)
setdiff(mexico$Muniselfreport,munikey$Muni)
rev(sort(unique(munikey$Muni)))
munikey[munikey$Muni==rev(sort(unique(munikey$Muni)))[48],"Muni"]<-"Zacatepec de Hidalgo"

mexico <- left_join(mexico, munikey, by=c("Stateselfreport"="State","Muniselfreport"="Muni"))

mexico%<>%
  mutate(geoFB=case_when(grepl("sur1",cell.x)~"sur1",
                         grepl("sur2",cell.x)~"sur2a",
                         grepl("sur3",cell.x)~ "sur3a",
                         grepl("sur4",cell.x)~ "sur4a",
                         grepl("centro1",cell.x)~"centro1",
                         grepl("centro2",mexico$cell.x)~ "centro2a",
                         grepl("centro3",cell.x)~ "centro3a",
                         grepl("centro4",cell.x)~ "centro4a",
                         grepl("occidente1",cell.x)~ "occidente1",
                         grepl("occidente2",cell.x)~ "occidente2a",
                         grepl("occidente3",cell.x)~ "occidente3a",
                         grepl("occidente4",cell.x)~ "occidente4a",
                         grepl("norte1",cell.x)~ "norte1",
                         grepl("norte2",cell.x)~ "norte2a",
                         grepl("norte3",cell.x)~ "norte3a",
                         grepl("norte4",cell.x)~ "norte4a"),
         geoselfreport=case_when(grepl("SUR 1",cell.y)~ "sur1",
                                grepl("SUR 2",cell.y)~ "sur2a",
                                grepl("SUR 3",cell.y)~ "sur3a",
                                grepl("SUR 4",cell.y)~ "sur4a",
                                grepl("CENTRO 1",cell.y)~ "centro1",
                                grepl("CENTRO 2",cell.y)~ "centro2a",
                                grepl("CENTRO 3",cell.y)~ "centro3a",
                                grepl("CENTRO 4",cell.y)~ "centro4a",
                                grepl("CENTRO OCCIDENTE 1",cell.y)~ "occidente1",
                                grepl("CENTRO OCCIDENTE 2",cell.y)~ "occidente2a",
                                grepl("CENTRO OCCIDENTE 3",cell.y)~ "occidente3a",
                                grepl("CENTRO OCCIDENTE 4",cell.y)~ "occidente4a",
                                grepl("NORTE 1",cell.y)~ "norte1",
                                grepl("NORTE 2",cell.y)~ "norte2a",
                                grepl("NORTE 3",cell.y)~ "norte3a",
                                grepl("NORTE 4",cell.y)~ "norte4a"),
         regionFB=case_when(grepl("norte",geoFB)~ "norte",
                            grepl("sur",geoFB)~ "sur",
                            grepl("occidente",geoFB)~ "occidente",
                            grepl("centro",geoFB)~ "centro"),
         regionselfreport =case_when(grepl("norte",geoselfreport)~ "norte",
                                    grepl("sur",geoselfreport)~ "sur",
                                    grepl("occidente",geoselfreport)~ "occidente",
                                    grepl("centro",geoselfreport)~ "centro"))


## pull out and save incompletes as a separate dataset 
mexico%<>%
  mutate(complete=ifelse(!is.na(ageselfreport)&!is.na(education)&education!=""&!is.na(sex)&Stateselfreport!=""&Muniselfreport!=""&!is.na(env1c2)&!is.na(env2b2)&!is.na(policy),
                         "complete","incomplete"))
## complete responses need age, education, and gender self reports, along with geographic id's and at least 1 substantive question .

incompletes<-mexico%>%
  filter(complete=="incomplete")
incompletes%<>%select(Duration..in.seconds.,StartDate,ResponseId,complete,genderFB,
                      ageFB,geoFB,education,regionFB,cell.x)
incompletes%<>%
  mutate(edselfreport=case_when(education==sort(unique(mexico$education))[1]~"secondary",
                                education=="Ninguno"~"nosecondary",
                                education=="Primaria"~"nosecondary",
                                education=="Secundaria"~"secondary",
                                education=="Superior de Universitaria"~"postsecondary",
                                education=="Universitaria"~"postsecondary"))%>%
  select(-education)

mexico<-mexico[mexico$complete=="complete",]

### create self-reported cells
mexico%<>%
  mutate(genderselfreport=case_when(sex=="Hombre"~ "M",
                                     sex=="Mujer"~"F"),
         cellselfreport=paste0(geoselfreport,".",genderselfreport,".",age),
         cellselfreport=ifelse(grepl("NA",cellselfreport),NA,cellselfreport))

table(table(mexico$cellselfreport))
median(table(mexico$cellselfreport))
quantile(table(mexico$cellselfreport))

## recode policy question to get frequency of individual items. will use svymean on binary responses
mexico$env1c2<-as.numeric(as.character(mexico$env1c2))
mexico$env2b2<-as.numeric(as.character(mexico$env2b2))
mexico$policy<-as.character(mexico$policy)
mexico<-mexico%>%
  dplyr::mutate(rich.countries=ifelse(grepl("ricos",policy)==TRUE,1,0),
                world.powers=ifelse(grepl("potencias mundiales",policy)==TRUE,1,0),
                citizens=ifelse(grepl("ciudadanos",policy)==TRUE,1,0),
                poor.countries=ifelse(grepl("pobres",policy)==TRUE,1,0),
                governments=ifelse(grepl("gobiernos",policy)==TRUE,1,0),
                international.organizations=ifelse(grepl("organismos",policy)==TRUE,1,0),
                business=ifelse(grepl("empresas",policy)==TRUE,1,0),
                scientists=ifelse(grepl("cient",policy)==TRUE,1,0),
                environmental.organizations=ifelse(grepl("ambientalistas",policy)==TRUE,1,0),
                myself.family.and.friends=ifelse(grepl("familia",policy)==TRUE,1,0),
                everybody=ifelse(grepl("Todos",policy)==TRUE,1,0),
                nobody=ifelse(grepl("Nadie",policy)==TRUE,1,0),
                dk=ifelse(grepl("No lo",policy)==TRUE,1,0)
  )


# set up raked weights #### 
# first calculate the proportion of the Mexican population in each cell

## create counts of each demographic group in each cell
cell_counts <- munikey %>%
  filter(cell!="")%>%
  group_by(cell) %>%
  dplyr::summarize(F1829.tot = sum(F1529)*12/15, ## b/c age groups are defined as 15-29 in Census. adjust to be 18-29.
                   F3049.tot = sum(F3049),
                   F5059.tot = sum(F5059),
                   F6065.tot = sum(F6064)+sum(F65),
                   M1829.tot = sum(M1529)*12/15,
                   M3049.tot = sum(M3049),
                   M5059.tot = sum(M5059),
                   M6065.tot = sum(M6064)+sum(M65))

cell_counts <- as.data.frame(cell_counts)
cell_counts$cell <- as.character(cell_counts$cell)
## combine subdivided cells back into one cell. 
unique(cell_counts$cell)
cell_counts<-cell_counts%>%
  mutate(cell=tolower(cell),
         cell=case_when(
           cell=="centro 1a"|cell=="centro 1b"~"centro 1",
           cell=="centro occidente 1a"|cell=="centro occidente 1b"~"occidente 1",
           cell=="norte 1a"|cell=="norte 1b"~"norte 1",
           cell=="sur 1a"|cell=="sur 1b"|cell=="sur 1c"|cell=="sur 1d"|cell=="sur 1e"~"sur 1",
           TRUE~cell
         ),
         cell=gsub("centro occidente","occidente",cell))%>%
  group_by(cell)%>%
  summarise_all(sum)
cell_counts<-cell_counts%>%filter(cell!="")

### create marginal region, gender, education, and age distributions
region<-cell_counts%>%
  pivot_longer(-cell,
               values_to="count",
               names_to="genderxage")%>%
  mutate(cell=gsub(" ","",cell))%>%
  group_by(cell)%>%
  summarise(Freq=sum(count))

gender<-cell_counts%>%
  pivot_longer(-cell,
               values_to="count",
               names_to="genderxage")%>%
  mutate(genderxage=substr(genderxage,1,1),
         cell=gsub(" ","",cell))%>%
  group_by(genderxage)%>%
  summarise(Freq=sum(count))%>%
  rename(genderselfreport=genderxage)
## population is: 72481931
sum(gender$Freq)

age<-cell_counts%>%
  pivot_longer(-cell,
               values_to="count",
               names_to = "agegroupselfreport")%>%
  mutate(agegroupselfreport=parse_number(agegroupselfreport),
         cell=gsub(" ","",cell))%>%
  group_by(agegroupselfreport)%>%
  summarise(Freq=sum(count))

# ## load education data
edu<-read.csv("mexico data/Educacion_04.csv",stringsAsFactors = FALSE)
edu2<-dcast(setDT(edu), State ~ education, value.var = c("F", "M")) 
## change headers to match names in survey. 
names(edu2)<-c("State","F.basic",
               "F.media",
               "F.superior",
               "F.unspecified",## this one is not included in our survey.
               "F.none",
               "M.basic",
               "M.media",
               "M.superior",
               "M.unspecified",## this one is not included in our survey.
               "M.none")
munikey2<-left_join(munikey,edu2,by="State")
names(munikey2)
munikey2[,29:38]<-munikey2[,29:38]/100

## find female and male populations and convert percentages to counts
fedprop<-munikey2%>%
  select(munistate,F.basic,F.media,F.superior,F.unspecified,F.none)
medprop<-munikey2%>%
  select(munistate,M.basic,M.media,M.superior,M.unspecified,M.none)
f.ed<-munikey2%>%
  select(munistate,cell,F1529,F3049,F5059,F6064,F65)%>%
  pivot_longer(cols=c(F1529,F3049,F5059,F6064,F65),
               values_to = 'f.ed',
               names_to = 'group')%>%
  group_by(munistate,cell)%>%
  summarise(pop=sum(f.ed))%>%
  ungroup()%>%
  left_join(fedprop,by="munistate")%>%
  mutate_at(c('F.basic','F.media','F.superior','F.unspecified','F.none'),~.*pop)
names(f.ed)[4:ncol(f.ed)]<-gsub("F.","",names(f.ed)[4:ncol(f.ed)])

m.ed<-munikey2%>%
  select(munistate,cell,M1529,M3049,M5059,M6064,M65)%>%
  pivot_longer(cols=c(M1529,M3049,M5059,M6064,M65),
               values_to = 'm.ed',
               names_to = 'group')%>%
  group_by(munistate,cell)%>%
  summarise(pop=sum(m.ed))%>%
  ungroup()%>%
  left_join(medprop,by="munistate")%>%
  mutate_at(c('M.basic','M.media','M.superior','M.unspecified','M.none'),~.*pop)
names(m.ed)[4:ncol(m.ed)]<-gsub("M.","",names(m.ed)[4:ncol(m.ed)])


ed<-rbind(m.ed,f.ed)%>%
  group_by(munistate,cell)%>%
  summarise_all(sum)%>%
  ungroup()


ed<-ed%>%
  mutate(cell=tolower(cell),
         cell=case_when(
           cell=="centro 1a"~"centro1",
           cell=="centro occidente 1a"|cell=="centro occidente 1b"~"centro occidente 1",
           cell=="norte 1a"|cell=="norte 1b"~"norte 1",
           cell=="sur 1a"|cell=="sur 1b"|cell=="sur 1c"|cell=="sur 1d"|cell=="sur 1e"~"sur 1",
           TRUE~cell
         ))%>%
  dplyr::select(-munistate,-pop)%>%
  filter(!is.na(cell)&cell!="")%>%
  pivot_longer(cols=basic:none,
               values_to='count',
               names_to="edselfreport")%>%
  mutate(edselfreport=case_when(edselfreport=="media"~"secondary",
                                edselfreport=="basic"~"nosecondary",
                                edselfreport=="none"~"nosecondary",
                                edselfreport=="superior"~"postsecondary",
                                edselfreport=="unspecified"~"unspecified"))%>%
  group_by(edselfreport)%>%
  summarise(Freq=sum(count,na.rm=TRUE))%>%
  filter(edselfreport!="unspecified")
head(ed)

regiontotal<-sum(region$Freq)
edtotal<-sum(ed$Freq)
agetotal<-sum(age$Freq)
gendertotal<-sum(gender$Freq)

region<-region%>%mutate(Freq=nrow(mexico)*Freq/regiontotal)

ed<-ed%>%
  mutate(Freq=nrow(mexico)*Freq/edtotal)
age<-age%>%mutate(Freq=nrow(mexico)*Freq/agetotal)
gender<-gender%>%mutate(Freq=nrow(mexico)*Freq/gendertotal)

sort(unique(mexico$edselfreport))
sort(unique(ed$edselfreport))
unique(mexico$genderselfreport)
unique(gender$genderselfreport)
mexico<-mexico%>%
  mutate(edselfreport=case_when(education==sort(unique(mexico$education))[1]~"secondary",
                      education=="Ninguno"~"nosecondary",
                      education=="Primaria"~"nosecondary",
                      education=="Secundaria"~"secondary",
                      education=="Superior de Universitaria"~"postsecondary",
                      education=="Universitaria"~"postsecondary"),
         agegroupselfreport=case_when(ageselfreport<30~"1829",
                                      ageselfreport>29&ageselfreport<50~"3049",
                                      ageselfreport>49&ageselfreport<60~"5059",
                                      ageselfreport>59~"6065"))
setdiff(mexico$agegroupselfreport,age$agegroupselfreport)
mexico<-mexico[!is.na(mexico$ageselfreport),]
setdiff(mexico$genderselfreport,gender$genderselfreport)
mexico<-mexico[mexico$genderselfreport!="NA",]
setdiff(mexico$edselfreport,ed$edselfreport)

# View(mexico[is.na(mexico$cell.y),]) ## empty muni columns-->can't match on muni key
unique(mexico$cell.y)

## recombine cells that were split in facebook ads
mexico<-mexico%>%
  separate(cell.x,c("cell.fb","gender.fb","age.fb"),sep="[.]")%>%
  mutate(cell.self=gsub(" ","",tolower(cell.y)))%>%
  mutate(cell.self=case_when(cell.self=="centro1a"|cell.self=="centro1b"~"centro1",
                             cell.self=="centrooccidente1a"|cell.self=="centrooccidente1b"~"occidente1",
                             cell.self=="norte1a"|cell.self=="norte1b"~"norte1",
                             cell.self=="sur1a"|cell.self=="sur1b"|cell.self=="sur1c"|cell.self=="sur1d"|cell.self=="sur1e"~"sur1",
                             TRUE~cell.self),
         cell.self=gsub("centrooccidente","occidente",cell.self)
  )
setdiff(mexico$cell.fb,region$cell)
setdiff(mexico$cell.fb,mexico$cell.self)
nrow(mexico[is.na(mexico$cell.self),]) ## 39 respondents who don't fit into a cell b/c they don't have a municipality in Mexico. 
nrow(mexico[mexico$cell.fb!=mexico$cell.self,]) ## 1754 people are in a different region than the one FB advertised to. 

## set up weighted surveys. Use people's self-reported geography
mexico2<-mexico[!is.na(mexico$cell.self)&mexico$cell.self!="",]
names(mexico2)[names(mexico2)=="cell.self"]<-"cell"

# save weighting groups
save(ed,gender,age,region,file="mexico data/weightingdata_mexico.Rda")


## create weights #### 
design<-svydesign(ids=~0,data=mexico2)
rake1<-rake(design=design,
            sample.margins = list(~edselfreport,~genderselfreport,~agegroupselfreport,~cell
            ),
            population=list(ed,gender,age,region
            ),control=list(maxit=100)) 
quantile(weights(rake1),probs=c(.05,.5,.95))

mexico2$weight_untrimmed<-mexico2$weight

rake1 <- trimWeights(rake1, lower=-Inf, upper=quantile(weights(rake1),probs=c(.95)),strict=TRUE)
mexico2$weight<-weights(rake1)
mexico2$id<-rownames(mexico2)


mexico<-mexico2
write_csv(mexico,file="mexico data/mexico_facebook_cleaned.csv")
write_csv(mexico,file="mexico data/mexico_facebook_incompletes.csv")
save(mexico,incompletes,file="mexico data/mexico_facebook_cleaned.Rda")






